In [1]:
# -*- coding: utf-8 -*-
In [2]:
pip install plotly
Requirement already satisfied: plotly in /Users/duanxiaoran/anaconda3/lib/python3.11/site-packages (5.9.0)
Requirement already satisfied: tenacity>=6.2.0 in /Users/duanxiaoran/anaconda3/lib/python3.11/site-packages (from plotly) (8.2.2)
Note: you may need to restart the kernel to use updated packages.
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
In [4]:
# Location of the dataset. Defaults to the original local path; set the
# YOUTUBE_STATS_CSV environment variable to run the notebook on another machine.
import os

DATA_PATH = os.environ.get(
    'YOUTUBE_STATS_CSV',
    '/Users/duanxiaoran/Downloads/Global YouTube Statistics.csv',
)

# The file is decoded as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')
In [5]:
# Print a plain-text preview of the first five rows of the loaded frame.
print(df.head())
   rank                    Youtuber  subscribers   video views  \
0     1                    T-Series    245000000  2.280000e+11   
1     2              YouTube Movies    170000000  0.000000e+00   
2     3                     MrBeast    166000000  2.836884e+10   
3     4  Cocomelon - Nursery Rhymes    162000000  1.640000e+11   
4     5                   SET India    159000000  1.480000e+11   

           category                       Title  uploads        Country  \
0             Music                    T-Series    20082          India   
1  Film & Animation               youtubemovies        1  United States   
2     Entertainment                     MrBeast      741  United States   
3         Education  Cocomelon - Nursery Rhymes      966  United States   
4             Shows                   SET India   116536          India   

  Abbreviation   channel_type  ...  subscribers_for_last_30_days  \
0           IN          Music  ...                     2000000.0   
1           US          Games  ...                           NaN   
2           US  Entertainment  ...                     8000000.0   
3           US      Education  ...                     1000000.0   
4           IN  Entertainment  ...                     1000000.0   

   created_year  created_month  created_date  \
0        2006.0            Mar          13.0   
1        2006.0            Mar           5.0   
2        2012.0            Feb          20.0   
3        2006.0            Sep           1.0   
4        2006.0            Sep          20.0   

   Gross tertiary education enrollment (%)    Population  Unemployment rate  \
0                                     28.1  1.366418e+09               5.36   
1                                     88.2  3.282395e+08              14.70   
2                                     88.2  3.282395e+08              14.70   
3                                     88.2  3.282395e+08              14.70   
4                                     28.1  1.366418e+09               5.36   

   Urban_population   Latitude  Longitude  
0       471031528.0  20.593684  78.962880  
1       270663028.0  37.090240 -95.712891  
2       270663028.0  37.090240 -95.712891  
3       270663028.0  37.090240 -95.712891  
4       471031528.0  20.593684  78.962880  

[5 rows x 28 columns]
In [6]:
# Summarise every column: name, non-null count and dtype (reveals which columns have missing values).
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 28 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   rank                                     995 non-null    int64  
 1   Youtuber                                 995 non-null    object 
 2   subscribers                              995 non-null    int64  
 3   video views                              995 non-null    float64
 4   category                                 949 non-null    object 
 5   Title                                    995 non-null    object 
 6   uploads                                  995 non-null    int64  
 7   Country                                  873 non-null    object 
 8   Abbreviation                             873 non-null    object 
 9   channel_type                             965 non-null    object 
 10  video_views_rank                         994 non-null    float64
 11  country_rank                             879 non-null    float64
 12  channel_type_rank                        962 non-null    float64
 13  video_views_for_the_last_30_days         939 non-null    float64
 14  lowest_monthly_earnings                  995 non-null    float64
 15  highest_monthly_earnings                 995 non-null    float64
 16  lowest_yearly_earnings                   995 non-null    float64
 17  highest_yearly_earnings                  995 non-null    float64
 18  subscribers_for_last_30_days             658 non-null    float64
 19  created_year                             990 non-null    float64
 20  created_month                            990 non-null    object 
 21  created_date                             990 non-null    float64
 22  Gross tertiary education enrollment (%)  872 non-null    float64
 23  Population                               872 non-null    float64
 24  Unemployment rate                        872 non-null    float64
 25  Urban_population                         872 non-null    float64
 26  Latitude                                 872 non-null    float64
 27  Longitude                                872 non-null    float64
dtypes: float64(18), int64(3), object(7)
memory usage: 217.8+ KB
In [7]:
# Boolean mask of fully duplicated rows; keep=False marks every copy, not just the repeats.
df.duplicated(keep=False)
Out[7]:
0      False
1      False
2      False
3      False
4      False
       ...  
990    False
991    False
992    False
993    False
994    False
Length: 995, dtype: bool
In [8]:
# Discard exact duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates()
In [9]:
# Number of channels per country, as a two-column frame (Country, Count).
country_counts = (
    df['Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Count')
)

# Pie chart: each slice is one country's share of the channels.
fig = px.pie(
    country_counts,
    names='Country',
    values='Count',
    title='Percentage of YouTubers in Different Countries',
)
fig.show()
In [10]:
# Rows whose Country is exactly 'United States'.
us_youtubers = df.loc[df['Country'].eq('United States')]
In [11]:
# Channels per category among the U.S. rows, as a (Video Type, Count) frame.
video_type_counts = (
    us_youtubers['category']
    .value_counts()
    .rename_axis('Video Type')
    .reset_index(name='Count')
)
In [12]:
# Bar chart of the U.S. per-category channel counts.
fig = px.bar(
    video_type_counts,
    x='Video Type',
    y='Count',
    title='U.S. YouTubers share of different types of videos',
)
fig.show()
In [13]:
# Rows whose Country is exactly 'United Kingdom'.
uk_youtubers = df.loc[df['Country'].eq('United Kingdom')]

# Channels per category among the UK rows, as a (Video Type, Count) frame.
video_type_counts_uk = (
    uk_youtubers['category']
    .value_counts()
    .rename_axis('Video Type')
    .reset_index(name='Count')
)

fig_uk = px.bar(
    video_type_counts_uk,
    x='Video Type',
    y='Count',
    title='UK YouTubers share of different types of videos',
)
fig_uk.show()
In [14]:
# Rows whose Country is exactly 'Australia'.
australia_youtubers = df.loc[df['Country'].eq('Australia')]

# Channels per category among the Australian rows, as a (Video Type, Count) frame.
video_type_counts_australia = (
    australia_youtubers['category']
    .value_counts()
    .rename_axis('Video Type')
    .reset_index(name='Count')
)

fig_australia = px.bar(
    video_type_counts_australia,
    x='Video Type',
    y='Count',
    title='AU YouTubers share of different types of videos',
)
fig_australia.show()
In [15]:
# Rows whose Country is exactly 'Canada'.
canada_youtubers = df.loc[df['Country'].eq('Canada')]

# Channels per category among the Canadian rows, as a (Video Type, Count) frame.
video_type_counts_canada = (
    canada_youtubers['category']
    .value_counts()
    .rename_axis('Video Type')
    .reset_index(name='Count')
)

fig_canada = px.bar(
    video_type_counts_canada,
    x='Video Type',
    y='Count',
    title='Canada YouTubers share of different types of videos',
)
fig_canada.show()
In [16]:
# Restrict the analysis to channels from these four countries.
selected_countries = ['United States', 'United Kingdom', 'Australia', 'Canada']
selected_youtubers = df.loc[df['Country'].isin(selected_countries)]

# Sum of lowest_yearly_earnings per category, pooled over the selected countries
# (the result is not broken down by country).
video_type_income = (
    selected_youtubers
    .groupby('category')['lowest_yearly_earnings']
    .sum()
    .rename_axis('Video Type')
    .reset_index(name='Total Income')
)

fig_income = px.bar(
    video_type_income,
    x='Video Type',
    y='Total Income',
    title='YouTubers total revenue from different video types in different countries',
)
fig_income.show()
In [17]:
# people & blog
In [18]:
# Channels in the 'People & Blogs' category.
people_blog_data = df.loc[df['category'].eq('People & Blogs')]

# Subscribers against lowest yearly earnings, one point per channel.
fig = px.scatter(
    people_blog_data,
    x='subscribers',
    y='lowest_yearly_earnings',
    title='People & Blogs video type revenue and number of subs',
    labels=dict(subscribers='subs', lowest_yearly_earnings='lowest earnings'),
)

fig.show()
In [20]:
# Channels in the 'People & Blogs' category.
people_blog_data = df.loc[df['category'].eq('People & Blogs')]

# Same scatter as before, now with an ordinary-least-squares trendline overlaid.
fig = px.scatter(
    people_blog_data,
    x='subscribers',
    y='lowest_yearly_earnings',
    title='People & Blogs video type revenue and number of subs',
    labels=dict(subscribers='subs', lowest_yearly_earnings='lowest earnings'),
    trendline='ols',
)
fig.show()
In [21]:
# Gaming: subscribers against lowest yearly earnings, with an OLS trendline.
gaming_data = df.loc[df['category'].eq('Gaming')]

fig_gaming = px.scatter(
    gaming_data,
    x='subscribers',
    y='lowest_yearly_earnings',
    title='Gaming',
    labels=dict(subscribers='subs', lowest_yearly_earnings='earnings'),
    trendline='ols',
)

fig_gaming.show()
In [22]:
# Music: subscribers against lowest yearly earnings, with an OLS trendline.
music_data = df.loc[df['category'].eq('Music')]

fig_music = px.scatter(
    music_data,
    x='subscribers',
    y='lowest_yearly_earnings',
    title='Music',
    labels=dict(subscribers='subs', lowest_yearly_earnings='earnings'),
    trendline='ols',
)

fig_music.show()
In [23]:
# Entertainment: subscribers against lowest yearly earnings, with an OLS trendline.
entertainment_data = df.loc[df['category'].eq('Entertainment')]

fig_entertainment = px.scatter(
    entertainment_data,
    x='subscribers',
    y='lowest_yearly_earnings',
    title='Entertainment',
    labels=dict(subscribers='subs', lowest_yearly_earnings='earnings'),
    trendline='ols',
)

fig_entertainment.show()
In [24]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 2x2 grid with one panel per category.
fig = make_subplots(rows=2, cols=2, subplot_titles=['Gaming', 'Music', 'Entertainment', 'People & Blogs'])

gaming_data = df.loc[df['category'].eq('Gaming')]
music_data = df.loc[df['category'].eq('Music')]
entertainment_data = df.loc[df['category'].eq('Entertainment')]
people_blog_data = df.loc[df['category'].eq('People & Blogs')]

# (trace name, rows for that category, grid row, grid column)
panels = [
    ('Gaming', gaming_data, 1, 1),
    ('Music', music_data, 1, 2),
    ('Entertainment', entertainment_data, 2, 1),
    ('People & Blogs', people_blog_data, 2, 2),
]

# One marker trace per panel: subscribers on x, lowest yearly earnings on y.
for panel_name, panel_rows, grid_row, grid_col in panels:
    fig.add_trace(
        go.Scatter(
            x=panel_rows['subscribers'],
            y=panel_rows['lowest_yearly_earnings'],
            mode='markers',
            name=panel_name,
        ),
        row=grid_row,
        col=grid_col,
    )

fig.update_layout(title_text='Revenue from different video types', showlegend=True)

# Label the axes of every panel identically.
for _, _, grid_row, grid_col in panels:
    fig.update_xaxes(title_text='subs', row=grid_row, col=grid_col)
for _, _, grid_row, grid_col in panels:
    fig.update_yaxes(title_text='earnings', row=grid_row, col=grid_col)

fig.show()
In [26]:
# add_trendline() below needs SciPy's linear regression. Importing it here fixes
# the NameError ("name 'linregress' is not defined") this cell used to raise.
from scipy.stats import linregress

# Single panel holding all four categories together.
fig = make_subplots(rows=1, cols=1, subplot_titles=['Revenue from different video types'])

gaming_data = df[df['category'] == 'Gaming']
music_data = df[df['category'] == 'Music']
entertainment_data = df[df['category'] == 'Entertainment']
people_blog_data = df[df['category'] == 'People & Blogs']

# Trace/legend name -> rows of that category. Insertion order fixes the trace order.
category_frames = {
    'Gaming': gaming_data,
    'Music': music_data,
    'Entertainment': entertainment_data,
    'People & Blogs': people_blog_data,
}

# One marker trace per category; legendgroup ties it to its trendline in the legend.
for category_name, category_rows in category_frames.items():
    fig.add_trace(go.Scatter(x=category_rows['subscribers'], y=category_rows['lowest_yearly_earnings'],
                             mode='markers', name=category_name, legendgroup=category_name))


def add_trendline(data, name):
    """Fit a straight line to one category and add it to the module-level ``fig``.

    Parameters
    ----------
    data : DataFrame with 'subscribers' and 'lowest_yearly_earnings' columns.
    name : category label; used for the trace name ('<name> Trendline') and
        as the legendgroup so the line is grouped with the category's markers.
    """
    slope, intercept, r_value, p_value, std_err = linregress(data['subscribers'], data['lowest_yearly_earnings'])
    # Evaluate the fitted line on 100 evenly spaced points across the observed x-range.
    x_fit = np.linspace(data['subscribers'].min(), data['subscribers'].max(), 100)
    y_fit = intercept + slope * x_fit
    fig.add_trace(go.Scatter(x=x_fit, y=y_fit, mode='lines', name=f'{name} Trendline', legendgroup=name))


# Trendlines are added after all marker traces, in the same category order.
for category_name, category_rows in category_frames.items():
    add_trendline(category_rows, category_name)

fig.update_xaxes(title_text='subs')
fig.update_yaxes(title_text='earnings')

fig.show()
In [27]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from scipy.stats import linregress

# Single panel holding all four categories together.
fig = make_subplots(rows=1, cols=1, subplot_titles=['Revenue from different video types'])

gaming_data = df.loc[df['category'].eq('Gaming')]
music_data = df.loc[df['category'].eq('Music')]
entertainment_data = df.loc[df['category'].eq('Entertainment')]
people_blog_data = df.loc[df['category'].eq('People & Blogs')]

# (legend name, rows of that category), in the order the traces are added.
series_by_category = [
    ('Gaming', gaming_data),
    ('Music', music_data),
    ('Entertainment', entertainment_data),
    ('People & Blogs', people_blog_data),
]

# Marker traces first: subscribers on x, lowest yearly earnings on y.
for label, rows in series_by_category:
    fig.add_trace(
        go.Scatter(
            x=rows['subscribers'],
            y=rows['lowest_yearly_earnings'],
            mode='markers',
            name=label,
            legendgroup=label,
        )
    )


def add_trendline(data, name):
    """Add a least-squares line for one category to the module-level ``fig``.

    ``data`` must provide 'subscribers' and 'lowest_yearly_earnings' columns;
    ``name`` labels the trace ('<name> Trendline') and is its legendgroup.
    """
    subs = data['subscribers']
    fit = linregress(subs, data['lowest_yearly_earnings'])
    # 100 evenly spaced x-values spanning the observed subscriber range.
    grid = np.linspace(subs.min(), subs.max(), 100)
    line = go.Scatter(
        x=grid,
        y=fit.intercept + fit.slope * grid,
        mode='lines',
        name=f'{name} Trendline',
        legendgroup=name,
    )
    fig.add_trace(line)


# Then one fitted line per category, in the same order as the markers.
for label, rows in series_by_category:
    add_trendline(rows, label)

fig.update_xaxes(title_text='subs')
fig.update_yaxes(title_text='earnings')

fig.show()
In [ ]: